```{r}
knitr::opts_chunk$set(echo = TRUE)
```
This file explores estimating the degrees of freedom $\nu$ via the kurtosis. Kurtosis is a measure of the "tailedness" of the probability distribution of a real-valued random variable (see also Wikipedia). The kurtosis of a random variable $X$ is defined as $$\text{Kurtosis}\left[X\right]=\text{E}\left[\left(\frac{X-\mu}{\sigma}\right)^{4}\right]=\frac{\text{E}\left[\left(X-\mu\right)^{4}\right]}{\left(\text{E}\left[\left(X-\mu\right)^{2}\right]\right)^{2}},$$ where $\mu$ and $\sigma$ are the mean and standard deviation of $X$; the sample kurtosis replaces these expectations with sample averages. In contrast to the degrees of freedom, a larger kurtosis value means a heavier tail. The kurtosis of a normal distribution equals 3. The excess kurtosis compares the kurtosis of a distribution against that of a normal distribution, i.e., $$\hat{k} = \text{Kurtosis}\left[X\right] - 3.$$ To correct the bias of the sample kurtosis, the following bias-corrected estimator is commonly used: $$\hat{\text{kurt}}(X) = \frac{T-1}{(T-2)(T-3)}\left((T+1)\hat{k} + 6\right).$$ For a multivariate distribution, in which we index each variable by $i$ ($1\le i \le N$), we define $\hat{\kappa}$ as $$\hat{\kappa} = \max{\left(-\frac{2}{N+2}, \frac{1}{3}\frac{1}{N}\sum_{i=1}^{N}\hat{\text{kurt}}(X_i)\right)}.$$ In the case of a Student $t$ distribution, one can simply use $\hat{\kappa} = \frac{2}{\hat{\nu} - 4}$, which gives $$\hat{\nu} = \frac{2}{\hat{\kappa}} + 4.$$
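As a quick sanity check of this mapping (a minimal sketch, not part of the analysis below; the choice $\nu = 7$ and the sample size are arbitrary), we can simulate univariate Student $t$ data with a known $\nu$, compute the sample excess kurtosis, and invert the relation:

```{r}
# Minimal check of the kappa <-> nu relation on univariate t data (nu = 7 is an arbitrary choice)
set.seed(42)
nu_true <- 7
x <- rt(1e6, df = nu_true)
excess_kurt <- mean((x - mean(x))^4) / mean((x - mean(x))^2)^2 - 3  # sample excess kurtosis
kappa_hat <- excess_kurt / 3   # for a single variable, kappa is just the excess kurtosis / 3
2 / kappa_hat + 4              # recovered nu, should be close to 7 (theoretical excess kurtosis is 6/(7-4) = 2)
```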
First, generate the multivariate Student $t$ data. Let us wrap it in a function for later use.
```{r}
genX <- function(N = 10, T = round(N*1.1), nu = 5, mu = rep(0, N)) {
  U <- t(mvtnorm::rmvnorm(n = round(1.1*N), sigma = 0.1*diag(N)))
  Sigma <- U %*% t(U) + diag(N)
  Sigma_scale <- (nu-2)/nu * Sigma
  X <- mvtnorm::rmvt(n = T, delta = mu, sigma = Sigma_scale, df = nu)
  return(X)
}
```
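A quick usage example (a minimal sketch; the specific values of `N`, `T`, and `nu` here are arbitrary choices): generate a small data set and check its dimensions.

```{r}
set.seed(123)
X_demo <- genX(N = 5, T = 20, nu = 6)  # 20 observations of a 5-dimensional Student t
dim(X_demo)                            # should be 20 x 5
```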
Then define functions to estimate $\nu$ via the kurtosis. Note that $X$ will be melted into a vector inside the function.
```{r}
excess_kurtosis_unbias <- function(x) {
  x <- as.vector(x)
  T <- length(x)
  excess_kurt <- PerformanceAnalytics::kurtosis(x, method = "excess")  # mean(x_demean^4) / (mean(x_demean^2))^2 - 3
  excess_kurt_unbias <- (T-1) / (T-2) / (T-3) * ((T+1)*excess_kurt + 6)
  return(excess_kurt_unbias)
}

excess_kurtosis_unbias_daniel <- function(Xc) {
  # Xc <- scale(Xc, center = TRUE, scale = TRUE)  # this doesn't make any distinguishable difference
  Xc <- cbind(Xc)
  T <- nrow(Xc)
  Xc <- scale(Xc, center = TRUE, scale = FALSE)
  ki <- colMeans(Xc^4)/colMeans(Xc^2)^2 - 3
  k <- mean(ki)
  k_improved <- (T-1)/((T-2)*(T-3)) * ((T+1)*k + 6)
  return(k_improved)
}

tmp <- rnorm(10)
excess_kurtosis_unbias(tmp)
excess_kurtosis_unbias_daniel(tmp)

est_nu_kurtosis <- function(X, comb_fun = mean) {
  X <- cbind(as.vector(scale(X, center = TRUE, scale = TRUE)))
  kurt <- apply(X, 2, excess_kurtosis_unbias)
  kappa <- max(-2/(ncol(X)+2), comb_fun(kurt)/3)
  nu <- 2 / kappa + 4
  # in case of some funny results
  if (nu < 2) nu <- 2
  if (nu > 100) nu <- 100
  return(nu)
}
```
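Before running the full simulations, a quick check of the estimator on a single data set with a known $\nu$ (a minimal sketch; the parameter values are arbitrary):

```{r}
set.seed(1)
X_check <- genX(N = 10, T = 50, nu = 5)
est_nu_kurtosis(X_check)  # should be roughly in the neighborhood of the true nu = 5
```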
```{r}
repeate <- 1e3
N <- 10
T_pool <- 10:20

res <- list()
for (i in 1:length(T_pool)) {
  T <- T_pool[i]
  # print(T)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("T" = T_pool, do.call(rbind, res)), id.vars = "T")
data$T <- as.factor(data$T)
ggplot(data, aes(x = T, y = value)) + geom_boxplot()
# ggplot(data, aes(x = T, y = value)) + geom_violin()
```
It seems that using the kurtosis to estimate $\nu$ can provide a meaningful reference. However, we notice that $\hat{\nu}$ still has some outliers, so we should use it as a regularization target, with a weight that decreases as $T/N$ increases (a sketch of such a combination is given below).
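One possible way to use $\hat{\nu}$ as such a target (a hypothetical sketch, not code from this analysis; the weight function and the helper name `nu_regularized` are illustrative assumptions) is to shrink another estimate, e.g. an MLE, toward the kurtosis-based value with a weight that decays in $T/N$:

```{r}
# Hypothetical sketch: shrink another estimate nu_other toward the kurtosis-based target.
# The weight w decays as T/N grows, so the target matters most when data are scarce.
nu_regularized <- function(nu_other, nu_kurt_target, T, N) {
  w <- 1 / (1 + T/N)  # illustrative choice of a weight decreasing in T/N
  w * nu_kurt_target + (1 - w) * nu_other
}

nu_regularized(nu_other = 12, nu_kurt_target = 6, T = 11, N = 10)   # small T/N: target has a large weight
nu_regularized(nu_other = 12, nu_kurt_target = 6, T = 100, N = 10)  # large T/N: target is nearly ignored
```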
```{r}
N <- 10
T <- 15
nu_pool <- 3:20

res <- list()
for (i in 1:length(nu_pool)) {
  nu <- nu_pool[i]
  # print(nu)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T, nu = nu))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("nu" = nu_pool, do.call(rbind, res)), id.vars = "nu")
data$nu <- as.factor(data$nu)
ggplot(data, aes(x = nu, y = value)) + geom_boxplot()
# ggplot(data, aes(x = nu, y = value)) + geom_violin()
```
```{r}
N <- 10

res <- list()
for (i in 1:length(T_pool)) {
  T <- T_pool[i]
  # print(T)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T), comb_fun = median)})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("T" = T_pool, do.call(rbind, res)), id.vars = "T")
data$T <- as.factor(data$T)
ggplot(data, aes(x = T, y = value)) + geom_boxplot()
# ggplot(data, aes(x = T, y = value)) + geom_violin()
```
```{r}
N <- 10
T <- 15

res <- list()
for (i in 1:length(nu_pool)) {
  nu <- nu_pool[i]
  # print(nu)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T, nu = nu), comb_fun = median)})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("nu" = nu_pool, do.call(rbind, res)), id.vars = "nu")
data$nu <- as.factor(data$nu)
ggplot(data, aes(x = nu, y = value)) + geom_boxplot()
# ggplot(data, aes(x = nu, y = value)) + geom_violin()
```
Well, it seems that using the median is worse than using the simple mean.
```{r}
N <- 100
T_pool <- seq(10, 200, 10)

res <- list()
for (i in 1:length(T_pool)) {
  T <- T_pool[i]
  # print(T)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("T" = T_pool, do.call(rbind, res)), id.vars = "T")
data$T <- as.factor(data$T)
ggplot(data, aes(x = T, y = value)) + geom_boxplot()
# ggplot(data, aes(x = T, y = value)) + geom_violin()
```
It seems that with a large $N$, the estimator via kurtosis becomes really good.
Now we fix the ratio $T/N$ at $1.1$ and change $N$ from $10$ to $100$ to see the results.
```{r}
N_pool <- seq(10, 100, 10)

res <- list()
for (i in 1:length(N_pool)) {
  N <- N_pool[i]
  # print(T)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = round(N*1.1)))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("N" = N_pool, do.call(rbind, res)), id.vars = "N")
data$N <- as.factor(data$N)
ggplot(data, aes(x = N, y = value)) + geom_boxplot()
# ggplot(data, aes(x = N, y = value)) + geom_violin()
```
It is clear that as $N$ increases, the estimation performance also improves. An explanation could be that the simple average of $\hat{\text{kurt}}$ becomes more reliable as $N$ grows large.
What if we keep $N \times T$ the same but change $N$ and $T$ accordingly?
```{r}
allocate_NnT <- function(NxT, TN_ratios = c((1:9)/10, 1:10, NxT-1)) {
  N <- sqrt(NxT/TN_ratios)
  T <- N * TN_ratios
  return(data.frame(N = round(N), T = round(T)))
}

# sanity check
tmp <- allocate_NnT(500)
str(tmp)
tmp$N * tmp$T
```
```{r}
NxT <- 200
TN_ratios <- c((1:9)/10, 1:10, NxT-1)
NT_comb <- allocate_NnT(NxT = NxT, TN_ratios = TN_ratios)

res <- list()
for (i in 1:nrow(NT_comb)) {
  N <- NT_comb$N[i]
  T <- NT_comb$T[i]
  # message("T = ", T, "\t N = ", N, "\t T x N = ", T*N)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("TN_ratios" = TN_ratios, do.call(rbind, res)), id.vars = "TN_ratios")
data$TN_ratios <- as.factor(data$TN_ratios)
ggplot(data, aes(x = TN_ratios, y = value)) + geom_boxplot()
# ggplot(data, aes(x = TN_ratios, y = value)) + geom_violin()
```
```{r}
NxT <- 500
NT_comb <- allocate_NnT(NxT = NxT, TN_ratios = TN_ratios)

res <- list()
for (i in 1:nrow(NT_comb)) {
  N <- NT_comb$N[i]
  T <- NT_comb$T[i]
  # message("T = ", T, "\t N = ", N, "\t T x N = ", T*N)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("TN_ratios" = TN_ratios, do.call(rbind, res)), id.vars = "TN_ratios")
data$TN_ratios <- as.factor(data$TN_ratios)
ggplot(data, aes(x = TN_ratios, y = value)) + geom_boxplot()
# ggplot(data, aes(x = TN_ratios, y = value)) + geom_violin()
```
```{r}
NxT <- 1000
NT_comb <- allocate_NnT(NxT = NxT, TN_ratios = TN_ratios)

res <- list()
for (i in 1:nrow(NT_comb)) {
  N <- NT_comb$N[i]
  T <- NT_comb$T[i]
  # message("T = ", T, "\t N = ", N, "\t T x N = ", T*N)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("TN_ratios" = TN_ratios, do.call(rbind, res)), id.vars = "TN_ratios")
data$TN_ratios <- as.factor(data$TN_ratios)
ggplot(data, aes(x = TN_ratios, y = value)) + geom_boxplot()
# ggplot(data, aes(x = TN_ratios, y = value)) + geom_violin()
```
```{r}
NxT <- 1500
NT_comb <- allocate_NnT(NxT = NxT, TN_ratios = TN_ratios)

res <- list()
for (i in 1:nrow(NT_comb)) {
  N <- NT_comb$N[i]
  T <- NT_comb$T[i]
  # message("T = ", T, "\t N = ", N, "\t T x N = ", T*N)
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T))})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("TN_ratios" = TN_ratios, do.call(rbind, res)), id.vars = "TN_ratios")
data$TN_ratios <- as.factor(data$TN_ratios)
ggplot(data, aes(x = TN_ratios, y = value)) + geom_boxplot()
# ggplot(data, aes(x = TN_ratios, y = value)) + geom_violin()
```
It seems $T$ is essential for the accuracy of the estimation.
```{r}
T <- 10
N_pool <- seq(1, 10, 1)

res <- list()
for (i in 1:length(N_pool)) {
  N <- N_pool[i]
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T), comb_fun = median)})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("N" = N_pool, do.call(rbind, res)), id.vars = "N")
data$N <- as.factor(data$N)
ggplot(data, aes(x = N, y = value)) + geom_boxplot()
# ggplot(data, aes(x = N, y = value)) + geom_violin()
```
```{r}
T <- 30
N_pool <- seq(1, 10, 1)

res <- list()
for (i in 1:length(N_pool)) {
  N <- N_pool[i]
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T), comb_fun = median)})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("N" = N_pool, do.call(rbind, res)), id.vars = "N")
data$N <- as.factor(data$N)
ggplot(data, aes(x = N, y = value)) + geom_boxplot()
# ggplot(data, aes(x = N, y = value)) + geom_violin()
```
```{r}
T <- 100
N_pool <- seq(1, 10, 1)

res <- list()
for (i in 1:length(N_pool)) {
  N <- N_pool[i]
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T), comb_fun = median)})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("N" = N_pool, do.call(rbind, res)), id.vars = "N")
data$N <- as.factor(data$N)
ggplot(data, aes(x = N, y = value)) + geom_boxplot()
# ggplot(data, aes(x = N, y = value)) + geom_violin()
```
```{r}
T <- 200
N_pool <- seq(1, 10, 1)

res <- list()
for (i in 1:length(N_pool)) {
  N <- N_pool[i]
  res[[i]] <- sapply(1:repeate, function(idx) {est_nu_kurtosis(genX(N = N, T = T), comb_fun = median)})
}
sapply(res, median)

library(ggplot2)
data <- reshape2::melt(data.frame("N" = N_pool, do.call(rbind, res)), id.vars = "N")
data$N <- as.factor(data$N)
ggplot(data, aes(x = N, y = value)) + geom_boxplot()
# ggplot(data, aes(x = N, y = value)) + geom_violin()
```
Now the result becomes clear: $T$ is essential for such estimation, while $N$ seems not to be that important. It is neither the $T/N$ ratio nor $T \times N$ that drives the accuracy: $T$ matters first, and then, given $T$, the bigger $N$ the better.